In [23]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import recommender as rcmd

load data and set up

Notes:
X     - num_movies (1682) x num_features (10) matrix of movie features
Theta - num_users (943) x num_features (10) matrix of user features
Y     - num_movies x num_users matrix of user ratings of movies
R     - num_movies x num_users matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user

In [24]:
movies_mat = sio.loadmat('./data/ex8_movies.mat')
Y, R = movies_mat.get('Y'), movies_mat.get('R')

Y.shape, R.shape


Out[24]:
((1682, 943), (1682, 943))
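
R works as a 0/1 mask over Y, so boolean indexing picks out only the submitted ratings. A quick check on the data just loaded (a sketch, not part of the original run):

In [ ]:
# average rating of the first movie, counting only the users who actually rated it
Y[0, R[0, :].astype(bool)].mean()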

In [25]:
m, u = Y.shape
# m: how many movies
# u: how many users

n = 10  # how many features for a movie

In [26]:
param_mat = sio.loadmat('./data/ex8_movieParams.mat')
theta, X = param_mat.get('Theta'), param_mat.get('X')

theta.shape, X.shape


Out[26]:
((943, 10), (1682, 10))

cost


In [27]:
# use a subset of the data to calculate the cost, as in the exercise PDF
users = 4
movies = 5
features = 3

X_sub = X[:movies, :features]
theta_sub = theta[:users, :features]
Y_sub = Y[:movies, :users]
R_sub = R[:movies, :users]

param_sub = rcmd.serialize(X_sub, theta_sub)

rcmd.cost(param_sub, Y_sub, R_sub, features)


Out[27]:
22.224603725685675

In [28]:
param = rcmd.serialize(X, theta)  # full parameter vector for the complete data set

rcmd.cost(param, Y, R, 10)  # cost on the full data set


Out[28]:
27918.64012454421
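
helper.recommender itself is not shown in this notebook. A minimal sketch of what serialize, deserialize and cost presumably do, assuming the standard collaborative-filtering cost where only entries with R(i, j) = 1 contribute:

In [ ]:
# sketch only -- the real implementations live in the helper module
def serialize(X, theta):
    """Flatten X (movies x features) and theta (users x features) into one vector."""
    return np.concatenate((X.ravel(), theta.ravel()))

def deserialize(param, n_movie, n_user, n_features):
    """Undo serialize: split the flat vector back into X and theta."""
    X = param[:n_movie * n_features].reshape(n_movie, n_features)
    theta = param[n_movie * n_features:].reshape(n_user, n_features)
    return X, theta

def cost(param, Y, R, n_features):
    """Unregularized cost: half the squared error over rated entries only."""
    n_movie, n_user = Y.shape
    X, theta = deserialize(param, n_movie, n_user, n_features)
    inner = (X @ theta.T - Y) * R  # R zeroes out the unrated entries
    return np.sum(inner ** 2) / 2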

gradient


In [29]:
n_movie, n_user = Y.shape

X_grad, theta_grad = rcmd.deserialize(rcmd.gradient(param, Y, R, 10),
                                      n_movie, n_user, 10)


In [30]:
assert X_grad.shape == X.shape
assert theta_grad.shape == theta.shape
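
Likewise, a sketch of what rcmd.gradient presumably computes: the partial derivatives of the unregularized cost with respect to X and theta, returned as one serialized vector (building on the sketch above):

In [ ]:
def gradient(param, Y, R, n_features):
    """Gradients of the unregularized cost, serialized like the parameters."""
    n_movie, n_user = Y.shape
    X, theta = deserialize(param, n_movie, n_user, n_features)
    inner = (X @ theta.T - Y) * R  # prediction error, masked to rated entries
    X_grad = inner @ theta         # shape: (n_movie, n_features)
    theta_grad = inner.T @ X       # shape: (n_user, n_features)
    return serialize(X_grad, theta_grad)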

regularized cost


In [31]:
# in ex8_cofi.m, lambda = 1.5 and the same sub data set is used
rcmd.regularized_cost(param_sub, Y_sub, R_sub, features, l=1.5)


Out[31]:
31.344056244274221

In [32]:
rcmd.regularized_cost(param, Y, R, 10, l=1)  # regularized cost on the full data set


Out[32]:
32520.682450229557
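
The regularized version adds an L2 penalty of lambda / 2 times the sum of every squared parameter (both X and theta). A sketch, reusing cost from above:

In [ ]:
def regularized_cost(param, Y, R, n_features, l=1.0):
    """Cost plus (l / 2) * sum of all squared parameters."""
    return cost(param, Y, R, n_features) + (l / 2) * np.sum(param ** 2)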

regularized gradient


In [33]:
n_movie, n_user = Y.shape

X_grad, theta_grad = rcmd.deserialize(rcmd.regularized_gradient(param, Y, R, 10),
                                      n_movie, n_user, 10)

assert X_grad.shape == X.shape
assert theta_grad.shape == theta.shape
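
And the regularized gradient just adds lambda times each parameter to the unregularized gradient. A sketch, reusing gradient from above:

In [ ]:
def regularized_gradient(param, Y, R, n_features, l=1.0):
    """Unregularized gradient plus the derivative of the L2 penalty, l * param."""
    return gradient(param, Y, R, n_features) + l * param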

parse movie_ids.txt


In [34]:
movie_list = []

with open('./data/movie_ids.txt', encoding='latin-1') as f:
    for line in f:
        tokens = line.strip().split(' ')
        movie_list.append(' '.join(tokens[1:]))  # drop the leading movie id, keep the title

movie_list = np.array(movie_list)

reproduce my ratings


In [35]:
ratings = np.zeros(1682)

ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
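
These indices are positions in movie_list, so it is easy to cross-check which titles the hard-coded ratings refer to:

In [ ]:
for i in np.nonzero(ratings)[0]:
    print(int(ratings[i]), movie_list[i])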

prepare data


In [36]:
Y, R = movies_mat.get('Y'), movies_mat.get('R')


Y = np.insert(Y, 0, ratings, axis=1)  # prepend my ratings as a new first column: I become user 0
Y.shape


Out[36]:
(1682, 944)

In [37]:
R = np.insert(R, 0, ratings != 0, axis=1)  # matching indicator column for my ratings
R.shape


Out[37]:
(1682, 944)

In [58]:
n_features = 50  # learn 50 features per movie this time
n_movie, n_user = Y.shape
l = 10  # regularization parameter lambda

In [59]:
# random initialization of the movie and user feature matrices
X = np.random.standard_normal((n_movie, n_features))
theta = np.random.standard_normal((n_user, n_features))

X.shape, theta.shape


Out[59]:
((1682, 50), (944, 50))

In [60]:
param = rcmd.serialize(X, theta)

normalized ratings


In [61]:
Y_norm = Y - Y.mean()  # subtract the global mean of Y (unrated zeros included)
Y_norm.mean()


Out[61]:
4.6862111343939375e-17
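
Note that Y.mean() here is the global mean over every entry of Y, including the zeros for unrated movies. A per-movie variant that averages only the rated entries would look roughly like the sketch below; Y_norm_movie is purely illustrative and is not used in the training that follows.

In [ ]:
# per-movie mean over rated entries only (a sketch, not used below)
Y_mean = np.array([Y[i, R[i, :] == 1].mean() if R[i, :].sum() > 0 else 0
                   for i in range(Y.shape[0])])
Y_norm_movie = (Y - Y_mean.reshape(-1, 1)) * R  # shift rated entries, keep unrated at 0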

training


In [62]:
import scipy.optimize as opt

In [63]:
res = opt.minimize(fun=rcmd.regularized_cost,
                   x0=param,
                   args=(Y_norm, R, n_features, l),
                   method='TNC',
                   jac=rcmd.regularized_gradient)

In [64]:
res


Out[64]:
     fun: 64721.49781507616
     jac: array([ -1.58635281e-06,  -4.38201786e-08,  -8.32037088e-07, ...,
         1.99915339e-07,   4.98546832e-07,  -2.27175198e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 2505
     nit: 74
  status: 1
 success: True
       x: array([  7.66989411e-01,   7.56774960e-01,   1.00894310e+00, ...,
         8.16557591e-01,   1.72679929e-04,   1.09739346e-01])

In [65]:
X_trained, theta_trained = rcmd.deserialize(res.x, n_movie, n_user, n_features)
X_trained.shape, theta_trained.shape


Out[65]:
((1682, 50), (944, 50))

In [66]:
prediction = X_trained @ theta_trained.T  # predicted rating matrix, movies x users

In [67]:
my_preds = prediction[:, 0] + Y.mean()  # my ratings are column 0; add the global mean back

In [68]:
idx = np.argsort(my_preds)[::-1]  # Descending order
idx.shape


Out[68]:
(1682,)

In [69]:
# top ten predicted ratings
my_preds[idx][:10]


Out[69]:
array([ 4.12534978,  4.04414835,  3.99324636,  3.91902945,  3.81691251,
        3.81556458,  3.76602976,  3.76323186,  3.75906567,  3.75077289])

In [70]:
for m in movie_list[idx][:10]:
    print(m)


Titanic (1997)
Star Wars (1977)
Shawshank Redemption, The (1994)
Forrest Gump (1994)
Raiders of the Lost Ark (1981)
Braveheart (1995)
Return of the Jedi (1983)
Usual Suspects, The (1995)
Godfather, The (1972)
Schindler's List (1993)

In [ ]: